#
# packages_to_install <- c('tidyverse', 'ggplot2', 'lme4') # main ones
# install.packages(packages_to_install)
#
# optional_package<- c('heplots','ggeffects','ggpubr','visreg', 'lattice', 'psych', '')
# install.packages(optional_package)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk along text.
This is an R Markdown document; it is like a Jupyter notebook or a MATLAB live script. It can incorporate blocks of text alongside blocks of R script.
The schedule is to first introduce common data types and variables, followed by functions and libraries for plotting. Afterwards we would apply typical analyses like anovas and mixed effect models to tutorial datasets. We will conclude by showing how to plot results of this data, as well as simulate data for instance for power testing.
This can be used as a cheat sheet, but the main thing to remember is that functions can act differently depending on data types, so it is important to know what data types are in your data.
# Basic types
num <- 42 # numeric (stored as double-precision by default)
char <- "hello" # character (a string)
bool <- TRUE # logical; usable as a condition or for filtering
# Vectors - created with c(); all elements share one type
vec <- c(1, 2, 3, 4)
vec[1] # R indexing starts at 1, not 0
## [1] 1
vec[0] # index 0 silently returns an empty vector, not an error
## numeric(0)
vec[5] # indexing past the end silently returns NA, not an error
## [1] NA
# Named vector - each element can carry a name
(vec <- c('height'=2, 'weight'=3, 'gender'=0))
## height weight gender
## 2 3 0
vec['height'] # elements can be selected by name instead of position
## height
## 2
names(vec) <- c('H','W','G') # names can be replaced after creation
vec
## H W G
## 2 3 0
# Lists - can hold different types of variables, e.g., vectors, other lists, or data frames.
# Can be useful when saving a JSON file.
lst <- list(name="Boris Johnson", age=54, scores=c(88, 90, 95))
# Matrices - 2-D, single type; matrix() fills column-by-column by default
mat <- matrix(1:9, nrow=3)
# Data Frames - table where each column is a vector (columns may differ in type)
student_data <- data.frame(
name = c("Ayse", "Ali", "Ece"),
score = c(90, 85, 88)
)
(vec <- c(1:10)) # 1:10 is already a vector, so the c() here is redundant
## [1] 1 2 3 4 5 6 7 8 9 10
vec[1]
## [1] 1
vec[0] # index 0 gives an empty vector of the same type
## integer(0)
vec[20] # out-of-range index gives NA, not an error
## [1] NA
# Named vector
(vec <- c('height'=2, 'weight'=3, 'gender'=0))
## height weight gender
## 2 3 0
vec['height']
## height
## 2
names(vec) <- c('H','W','G')
vec
## H W G
## 2 3 0
vec[2] # single bracket [ preserves the name attribute
## W
## 3
vec[[2]] # double bracket [[ drops the name and returns the bare value
## [1] 3
# read this for more detailed explanation - https://alexander-pastukhov.github.io/data-analysis-using-r-for-psychology/tables.html
(vec <- c(1:10))
## [1] 1 2 3 4 5 6 7 8 9 10
vec[c(1,2,3)] # index with a vector to grab several elements at once
## [1] 1 2 3
vec[c(1,1,1,1,1,2,3)] # repeated indices are allowed
## [1] 1 1 1 1 1 2 3
vec[c(-1,-2,-3)] # negative indices EXCLUDE: all but 1st, 2nd, 3rd
## [1] 4 5 6 7 8 9 10
5:2 # the colon operator also counts down
## [1] 5 4 3 2
seq(5,2,by=-1) # equivalent, but with an explicit step size
## [1] 5 4 3 2
## you can add vectors of different lengths, and weirdly you don't get an error
# Unfortunately, R has a solution and that is not an error or a warning. Rather, if the length of the longer vector is a multiple of the shorter vector length, the shorter vector is “recycled” , i.e., repeated N-times (where N = length (longer vector) / length (shorter vector) )
x <- 1:6
y <- c(2, 3)
print(x + y) # y is recycled to c(2, 3, 2, 3, 2, 3) before adding
## [1] 3 5 5 7 7 9
Variables can be useful, e.g., when we want to iterate or apply a function to each element in a vector, or organise data into a data frame. You can perform simple arithmetic operations on variables, which can be useful when you want to do something not available in a package.
# note: '<-' is the conventional assignment operator in R; '=' also works at top level
a = 5
b = 2
print(a+b)
## [1] 7
print(a*b)
## [1] 10
print(paste('a to the power of b',a^b)) # ^ is exponentiation
## [1] "a to the power of b 25"
print(paste('string one', 'string two')) # our first use of a function, paste can concatenate strings, by default separated by space
## [1] "string one string two"
# note that paste converted the number a^b to a character
# you can often convert variables from one format to another, which may be useful later on
print(a)
## [1] 5
print(as.character(a)) # number -> string
## [1] "5"
TEMP <- "51"
print(TEMP)
## [1] "51"
print(as.numeric(TEMP)) # string -> number (gives NA with a warning if not parseable)
## [1] 51
correct <- 1
print(correct)
## [1] 1
print(as.logical(correct)) # 0 -> FALSE, non-zero -> TRUE
## [1] TRUE
## you can provide two vectors of the same length to paste and it will concatenate them element-wise
(nth <- paste(1:12)) # outer brackets are a short hand for print - function
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
(nth<- paste(1:12, c("st", "nd", "rd", rep("th",9)))) # ok but we do not want space between the two strings, so we can give the function an argument to do this first lets check the help menu ?paste - this would often show arguments for a function, we see something called sep - which controls the separation between strings
## [1] "1 st" "2 nd" "3 rd" "4 th" "5 th" "6 th" "7 th" "8 th" "9 th" "10 th" "11 th" "12 th"
(nth<- paste(1:12, c("st", "nd", "rd", rep("th",9)), sep="")) # in R sometimes there are short hands for commonly used functions, e.g., paste0 takes the default value of sep="", so
## [1] "1st" "2nd" "3rd" "4th" "5th" "6th" "7th" "8th" "9th" "10th" "11th" "12th"
(nth<- paste0(1:12, c("st", "nd", "rd", rep("th",9)))) # paste0(...) is identical to paste(..., sep="")
## [1] "1st" "2nd" "3rd" "4th" "5th" "6th" "7th" "8th" "9th" "10th" "11th" "12th"
Working with strings can be particularly useful when we want to deal with file names, but before I show that I want to mention some simple functions that can be particularly helpful when combining variables together We already saw one of these functions - rep
(rep(1,2)) # this can repeat a variable multiple times, so for example if we had a vector of 1s (males) and 0s (females) we could repeat it to match our data. Lets say we have data from 10 males and 10 females
## [1] 1 1
(rep(c(1,0),10)) # ah, but this repeated the whole vector 10 times; maybe we have first the 10 males and later the 10 females
## [1] 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
(rep(c(1,0),each=10)) # each= repeats every element in turn; and what if now we collect another 20 people in the same order?
## [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# each=10 expands every element, then times=2 repeats the whole 20-long pattern -> 40 values
(temp_gender <- rep(c(1,0),each=10,times=2))
## [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# rep works also with factors or strings
gender <-c("male","female")
(rep(gender, each=10))
## [1] "male" "male" "male" "male" "male" "male" "male" "male" "male" "male" "female" "female" "female"
## [14] "female" "female" "female" "female" "female" "female" "female"
# factor() maps the numeric codes (levels) onto readable labels
gender_factor <-factor(temp_gender, levels = c(1,0), labels = c("male","female"))
summary(gender_factor)
## male female
## 20 20
unclass(gender_factor) # internally a factor stores integer codes plus a "levels" attribute
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## attr(,"levels")
## [1] "male" "female"
Now that we have a vector temp_gender we may want to manipulate it. We can change it with maths or with logical indexing - cleaning and organising the data is often the most time-consuming part of research. I will first show the most common operations with basic R since that would be most similar to other programming languages like Python and MATLAB. I will then show some short hands that can make things a bit easier to code but achieve the same result. Because we have a vector, we can either apply an operation to every element in the vector, or - what we most often want - apply one operation to some elements and another operation to other elements. For instance, we may want to recode our variables to be -1 for males and 1 for females.
# Pre-allocate the output vector (same length as the input) before looping
temp_gender_new <- numeric(length(temp_gender))
# we can iterate over numbers
for (i in 1:5 ) {
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# we can assign conditions to do something only when a condition is met - here when the i is odd rather than even
for (i in 1:5 ) {
if (i %% 2){
print(i)
}
}
## [1] 1
## [1] 3
## [1] 5
# Loop through each element in temp_gender
# seq_along() generates 1..length(temp_gender) and is safe even for empty vectors
for (i in seq_along(temp_gender)) {
if (temp_gender[i] == 0) {
temp_gender_new[i] <- -1 # if value is 0, assign -1
} else {
temp_gender_new[i] <- 1 # otherwise, assign 1
}
}
(rbind(temp_gender,temp_gender_new)) # stack rows to compare original and recoded side by side
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
## temp_gender 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
## temp_gender_new 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
## temp_gender 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
## temp_gender_new -1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1
## [,38] [,39] [,40]
## temp_gender 0 0 0
## temp_gender_new -1 -1 -1
### ok you might be wondering if there is a short hand, as this is something we would likely have to do quite often - and there is
# ifelse() is the vectorized if: it tests the whole vector at once, no loop needed
temp_gender_new <- ifelse(temp_gender == 0, -1 , 1) # what this does is if temp_gender variable is equal to 0 we make it -1 otherwise we code everything else as 1
# although we did this with for loops and if conditions, for this specific situation the easiest may be a simple arithmetic solution - multiplication by scalar performs the operation on each element
# (0,1 coding maps onto -1,1 because 0*2 - 1 = -1 and 1*2 - 1 = 1)
temp_gender_recode <- temp_gender*2 - 1
(rbind(temp_gender, temp_gender_new, temp_gender_recode)) # all three rows should agree
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
## temp_gender 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
## temp_gender_new 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## temp_gender_recode 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## temp_gender 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
## temp_gender_new -1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1
## temp_gender_recode -1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1
## [,37] [,38] [,39] [,40]
## temp_gender 0 0 0 0
## temp_gender_new -1 -1 -1 -1
## temp_gender_recode -1 -1 -1 -1
As you can see, there are multiple ways to achieve the same goal; this is very common in programming, especially in open software where different people might have created different packages that do basically the same thing. It can be tricky, as sometimes the packages appear to do the same things but there are subtle differences; often this is easier to detect when re-formatting data, but may be more subtle when working with statistical packages, e.g., those that calculate effect sizes or marginal means. But for now back on task: we now know the basics of programming that we would need in most psychology situations. We know how to iterate over things and provide conditions; what we do not know is how to write our own functions.
# often there would be some pre-made function that does what you need, but sometimes it may be useful to use a function to wrap around a piece of code you use often
# here I simply show the syntax of a very simple function; later in the workshop we will use a more useful one.
# main points are: we define a function, pass it arguments (x, y), make some computation, and return the output.
add_two_numbers <- function(x=1, y=1) {
  # Sum two values; both arguments default to 1.
  # x, y: numbers (numeric vectors are added element-wise).
  # The last expression is returned automatically, so no
  # explicit return() is needed here.
  x + y
}
print(add_two_numbers())
## [1] 2
print(add_two_numbers(x=5, y =10))
## [1] 15
Ok, after seeing some basics of programming, it may be a good time to actually start doing some things we might encounter in the real world. In my opinion, doing something is the best way to learn; you should not be too scared of making mistakes, and the syntax is something the internet or ChatGPT can help you with. There are extensive resources online introducing R and showing how to use different data types, vectors and functions; here I have listed some of them.
Most famous ones are the Navarro R book - https://learningstatisticswithr.com/ For people with no experience with R - https://bookdown.org/rdpeng/rprogdatascience/
https://intro2r.com/ https://www.w3schools.com/r/r_intro.asp https://r4ds.had.co.nz/index.html https://rpubs.com/ruruu127/417821 https://alexander-pastukhov.github.io/data-analysis-using-r-for-psychology/index.html - Aimed at Psychologists
tidyverse is a package in R that has gained popularity; it is effectively a shorthand way to achieve what you can also do in base R with more code. Some people love it, some people hate it. My opinion is that base R is more similar to other languages, so it may be easier to read. The “speed” you gain from running things in tidyverse is unlikely to be important for most psychologists. It can act a bit too much as a black box when you are learning things. However, it can sometimes be easier for plotting and data-wrangling. I will try to focus on base R in this tutorial but will try and include tidyverse alternatives to give an introduction to it as well. One thing to note is that many packages may have functions and operators (e.g., *, %>%) that are named the same so they can interact. This means that when using multiple packages one has to be slightly careful and expect occasional errors.
https://www.tidyverse.org/learn/ https://bookdown.org/yih_huynh/Guide-to-R-Book/tidyverse.html
However, reading too much into “theory” may not be useful, as quite likely one will forget exactly what command performed a certain operation unless they do it repeatedly.
# Create one folder per subject (sub-01 ... sub-20), each containing a
# small demographics CSV. Note: ages/conditions come from sample(), so
# contents differ on every run unless a seed is set beforehand.
# Number of subjects
n_subjects <- 20
# Loop over subject numbers. seq_len() is used instead of 1:n_subjects
# because 1:n would silently produce c(1, 0) when n_subjects is 0 and
# create a bogus "sub-00" folder; seq_len(0) correctly yields nothing.
for (i in seq_len(n_subjects)) {
  # Create a subject ID like "sub-01" (zero-padded to two digits)
  sub_id <- sprintf("sub-%02d", i)
  # Create a folder for the subject; suppress the warning if it exists
  dir.create(sub_id, showWarnings = FALSE)
  # Create a simple one-row data frame (optional content)
  demo_data <- data.frame(
    subject = sub_id,
    age = sample(18:65, 1),
    condition = sample(c("control", "treatment"), 1)
  )
  # Define file path: sub-01/sub-01.csv
  file_path <- file.path(sub_id, paste0(sub_id, ".csv"))
  # Write the CSV file without the row-name column
  write.csv(demo_data, file = file_path, row.names = FALSE)
}
# Current working directory (the parentheses around "." are harmless but unnecessary)
main_dir <- (".")
subject_folders <- list.dirs(main_dir, full.names = TRUE, recursive = FALSE) # or better yet to use a pattern
# from grep can use regular expressions
# strip the leading "./" prefix; note folder_list is not used again in this chunk
folder_list <- gsub("\\./", "",subject_folders)
## alternatively
(subject_folders <- list.files(main_dir, full.names = TRUE, recursive = FALSE, pattern="sub")) # or better yet to use a pattern
## [1] "./sub-01" "./sub-02" "./sub-03" "./sub-04" "./sub-05" "./sub-06" "./sub-07" "./sub-08" "./sub-09" "./sub-10" "./sub-11"
## [12] "./sub-12" "./sub-13" "./sub-14" "./sub-15" "./sub-16" "./sub-17" "./sub-18" "./sub-19" "./sub-20"
# Collect each subject's data frame in a list, then bind once at the end;
# this avoids the slow pattern of growing a data frame with rbind() inside the loop
all_data_list <- list()
# Loop through each subject folder
for (folder in subject_folders) {
# Find CSV files in that folder
csv_files <- list.files(folder, pattern ="^sub-\\d{2}\\.csv$", full.names = TRUE) # - "\\.csv$" - find any csv file; - find csv file that matches pattern sub-{2 numbers}.csv
# Only proceed if at least one CSV is found
if (length(csv_files) > 0) {
# (Assume the first one is the subject CSV)
csv_path <- csv_files[1]
# Read it
df <- read.csv(csv_path)
# Add subject info from folder name
df$subject <- basename(folder)
# Append to list, keyed by folder name (e.g. "sub-01")
all_data_list[[basename(folder)]] <- df
} else {
warning(paste("No CSV found in", folder))
}
}
# Combine all into one data frame (do.call passes the list elements as arguments to rbind)
combined_data <- do.call(rbind, all_data_list)
# Show result (first six rows; the row names come from the list names)
head(combined_data)
## subject age condition
## sub-01 sub-01 61 treatment
## sub-02 sub-02 23 control
## sub-03 sub-03 65 control
## sub-04 sub-04 32 control
## sub-05 sub-05 57 control
## sub-06 sub-06 32 treatment
Can we achieve the same with less code?
library(tidyverse)
## ── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 1.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Base directory containing the per-subject folders
main_dir <- "."
# Get all subject folders that start with "sub-"
# keep() filters by a predicate; "~ ..." is purrr's lambda shorthand, .x is the current element
subject_dirs <- list.dirs(main_dir, recursive = FALSE) %>%
keep(~ str_detect(basename(.x), "^sub-\\d+"))
# equivalent without the pipe:
# temp_name <- list.dirs(main_dir, recursive = FALSE)
# keep(temp_name, ~ str_detect(basename(.x), "^sub-\\d+"))
# Read and combine the first .csv file from each subject folder
# map_dfr() applies the function to each folder and row-binds the results into one data frame;
# folders where the if-condition fails contribute nothing (the function returns NULL)
combined_data <- subject_dirs %>%
map_dfr(function(folder) {
csv_file <- list.files(folder, pattern = "\\.csv$", full.names = TRUE)[1]
if (!is.na(csv_file) && file.exists(csv_file)) {
read_csv(csv_file) %>%
mutate(subject = basename(folder))
}
})
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
##
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## subject = col_character(),
## age = col_double(),
## condition = col_character()
## )
head(combined_data)
## # A tibble: 6 × 3
## subject age condition
## <chr> <dbl> <chr>
## 1 sub-01 61 treatment
## 2 sub-02 23 control
## 3 sub-03 65 control
## 4 sub-04 32 control
## 5 sub-05 57 control
## 6 sub-06 32 treatment
We achieved the same goal with less code, but there are a lot of things going on that may be difficult to follow so I will provide a bit more general introduction to tidyverse here The basic idea of the package is to apply functions to elements of data.frames more easily and automatically, without us having to do the for loops Lets re-start a new data so we can show some of the functionality of tidyverse - which is a large collection of packages most common being ggplot and dplyr - they can be installed separately if you have issues with install.packages(“tidyverse”)
We will demonstrate some of the main features and plotting using the iris dataset iris is a tidy dataset. Each row is an observation of an individual iris, each column is a different variable. This can be thought of as long form data (foreshadowing) - Basic idea is that each row might be a trial from experimental paradigm, or a subject, and each column could be a variable, e.g., Condition, Reaction Time, Age, Gender, etc. This format of storing data is very common throughout statistics and is not really specific to R
— see readxl for reading excel data e.g., read_excel(“my_excel_file.xls”, sheet=2) / read_excel(“my_excel_file.xls”, sheet=“addendum”) — saveRDS and readRDS can be used when you want to save a vector or R object that is not a csv
print(dim(iris))
## [1] 150 5
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
print(class(iris))
## [1] "data.frame"
colnames(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
glimpse(iris)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4…
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4…
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7…
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2…
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa,…
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
## Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# pkg::fun() calls a function from a package without loading the full package
psych::describe(iris)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31 -0.61 0.07
## Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31 0.14 0.04
## Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27 -1.42 0.14
## Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10 -1.36 0.06
## Species* 5 150 2.00 0.82 2.00 2.00 1.48 1.0 3.0 2.0 0.00 -1.52 0.07
# One can grab data from specific rows or columns by simply indexing the data [row, column] - e.g., 2nd row, 3rd column
iris[2,3]
## [1] 1.4
iris$Sepal.Length # grab whole column
## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
## [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
## [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
## [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
iris['Sepal.Length']
## Sepal.Length
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## 11 5.4
## 12 4.8
## 13 4.8
## 14 4.3
## 15 5.8
## 16 5.7
## 17 5.4
## 18 5.1
## 19 5.7
## 20 5.1
## 21 5.4
## 22 5.1
## 23 4.6
## 24 5.1
## 25 4.8
## 26 5.0
## 27 5.0
## 28 5.2
## 29 5.2
## 30 4.7
## 31 4.8
## 32 5.4
## 33 5.2
## 34 5.5
## 35 4.9
## 36 5.0
## 37 5.5
## 38 4.9
## 39 4.4
## 40 5.1
## 41 5.0
## 42 4.5
## 43 4.4
## 44 5.0
## 45 5.1
## 46 4.8
## 47 5.1
## 48 4.6
## 49 5.3
## 50 5.0
## 51 7.0
## 52 6.4
## 53 6.9
## 54 5.5
## 55 6.5
## 56 5.7
## 57 6.3
## 58 4.9
## 59 6.6
## 60 5.2
## 61 5.0
## 62 5.9
## 63 6.0
## 64 6.1
## 65 5.6
## 66 6.7
## 67 5.6
## 68 5.8
## 69 6.2
## 70 5.6
## 71 5.9
## 72 6.1
## 73 6.3
## 74 6.1
## 75 6.4
## 76 6.6
## 77 6.8
## 78 6.7
## 79 6.0
## 80 5.7
## 81 5.5
## 82 5.5
## 83 5.8
## 84 6.0
## 85 5.4
## 86 6.0
## 87 6.7
## 88 6.3
## 89 5.6
## 90 5.5
## 91 5.5
## 92 6.1
## 93 5.8
## 94 5.0
## 95 5.6
## 96 5.7
## 97 5.7
## 98 6.2
## 99 5.1
## 100 5.7
## 101 6.3
## 102 5.8
## 103 7.1
## 104 6.3
## 105 6.5
## 106 7.6
## 107 4.9
## 108 7.3
## 109 6.7
## 110 7.2
## 111 6.5
## 112 6.4
## 113 6.8
## 114 5.7
## 115 5.8
## 116 6.4
## 117 6.5
## 118 7.7
## 119 7.7
## 120 6.0
## 121 6.9
## 122 5.6
## 123 7.7
## 124 6.3
## 125 6.7
## 126 7.2
## 127 6.2
## 128 6.1
## 129 6.4
## 130 7.2
## 131 7.4
## 132 7.9
## 133 6.4
## 134 6.3
## 135 6.1
## 136 7.7
## 137 6.3
## 138 6.4
## 139 6.0
## 140 6.9
## 141 6.7
## 142 6.9
## 143 5.8
## 144 6.8
## 145 6.7
## 146 6.7
## 147 6.3
## 148 6.5
## 149 6.2
## 150 5.9
iris[['Sepal.Length']]
## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
## [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
## [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
## [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
(iris[,1]) # grab whole column but use numeric index
## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
## [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
## [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
## [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
head(iris[,-1]) # grab all columns but first one
## Sepal.Width Petal.Length Petal.Width Species
## 1 3.5 1.4 0.2 setosa
## 2 3.0 1.4 0.2 setosa
## 3 3.2 1.3 0.2 setosa
## 4 3.1 1.5 0.2 setosa
## 5 3.6 1.4 0.2 setosa
## 6 3.9 1.7 0.4 setosa
iris[iris$Species == "setosa", ] # grab all columns but focus on rows that are from the setosa species
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
(iris[iris$Species == "setosa" | iris$Species == "virginica", ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Often we would have some missing values that we have to find
iris_copy <- iris
make_missing <- sample(c(1,0),size = nrow(iris_copy), prob = c(0.10,0.90), replace = TRUE) # draws 1 or 0 with replacement: 1 (make missing) has a 10% chance, 0 has a 90% chance
iris_copy$Sepal.Length[as.logical(make_missing)] <- NA
# NOTE(review): no set.seed() call, so the missing pattern differs on every run
(missing_values <- is.na(iris_copy$Sepal.Length))
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [21] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [81] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
## [141] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
print(sum(missing_values)/nrow(iris_copy))
## [1] 0.09333333
# ok what is the mean length
mean(iris$Sepal.Length, na.rm = TRUE) # na.rm = TRUE drops NA values before averaging
## [1] 5.843333
mean(iris$Sepal.Length[iris$Species == "setosa"], na.rm = TRUE) # mean sepal length for setosa only
## [1] 5.006
mean(iris[iris$Species == "setosa", "Petal.Length"], na.rm = TRUE) # same idea via row + column indexing
## [1] 1.462
## ok but we can also achieve this with piping which is very common in the tidyverse
# piping is a common programming trick where we feed the result of one operation into another
# (%>% comes from magrittr and is re-exported by dplyr/tidyverse, which must be loaded first)
iris$Sepal.Length %>% mean()
## [1] 5.843333
# This may look pointless but it can be quite helpful when you want to perform multiple functions together
# lets say we have repeats - we may want to calculate the mean of unique values
mean(unique(iris$Sepal.Length))
## [1] 6.011429
iris$Sepal.Length %>% unique() %>% mean() %>% round(digits=3)
## [1] 6.011
## %>% - can be typed with ctrl + shift + m / or command + shift + m in RStudio
# z <- 4
# w <- 3
# z %>% radius(w) # is the same as radius(z,w) - piped output is the first argument
# z %>% radius(x = w, y = .) # is radius(w,z) - you can use the dot to specify the place for piped output
## remember this
## subject_dirs <- list.dirs(main_dir, recursive = FALSE) %>%
## keep(~ str_detect(basename(.x), "^sub-\\d+"))
# we are feeding the results of list.dirs (the subject folders) to another function that will apply operations to each element, but we are getting ahead of ourselves
Let's do some data wrangling - see https://rpubs.com/Earlien/Wrangling. dplyr is a package that is part of the larger tidyverse. dplyr is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges - https://dplyr.tidyverse.org/. As you can imagine, base R can do everything that the tidyverse can, but it can become a bit hairy for the basic data operations that are most common.
library(dplyr)
# lets say we may want to select only certain columns - select is a good way to do this
# select() keeps only the named columns, in the order they are given
iris_select <- iris %>% select(Species, Petal.Length, Petal.Width)
head(iris_select)
## Species Petal.Length Petal.Width
## 1 setosa 1.4 0.2
## 2 setosa 1.4 0.2
## 3 setosa 1.3 0.2
## 4 setosa 1.5 0.2
## 5 setosa 1.4 0.2
## 6 setosa 1.7 0.4
# ok but what if we do not want to list all the names e.g., imagine you have more than 20 columns - you can use some helper functions
iris_select <- iris %>% select(Species, matches("Width")) # any column whose name matches the regex "Width"
head(iris_select)
## Species Sepal.Width Petal.Width
## 1 setosa 3.5 0.2
## 2 setosa 3.0 0.2
## 3 setosa 3.2 0.2
## 4 setosa 3.1 0.2
## 5 setosa 3.6 0.2
## 6 setosa 3.9 0.4
iris_select <- iris %>% select(Species, contains("Sepal")) # any column whose name contains "Sepal"
head(iris_select)
## Species Sepal.Length Sepal.Width
## 1 setosa 5.1 3.5
## 2 setosa 4.9 3.0
## 3 setosa 4.7 3.2
## 4 setosa 4.6 3.1
## 5 setosa 5.0 3.6
## 6 setosa 5.4 3.9
iris_select <- iris %>% select(Species, starts_with("Petal"))
head(iris_select)
## Species Petal.Length Petal.Width
## 1 setosa 1.4 0.2
## 2 setosa 1.4 0.2
## 3 setosa 1.3 0.2
## 4 setosa 1.5 0.2
## 5 setosa 1.4 0.2
## 6 setosa 1.7 0.4
iris_select <- iris %>% select(Species, ends_with("Length"))
head(iris_select)
## Species Sepal.Length Petal.Length
## 1 setosa 5.1 1.4
## 2 setosa 4.9 1.4
## 3 setosa 4.7 1.3
## 4 setosa 4.6 1.5
## 5 setosa 5.0 1.4
## 6 setosa 5.4 1.7
iris %>% select(matches("[A-Z]\\."))# any column that has capital letter followed by a dot
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
## 7 4.6 3.4 1.4 0.3
## 8 5.0 3.4 1.5 0.2
## 9 4.4 2.9 1.4 0.2
## 10 4.9 3.1 1.5 0.1
## 11 5.4 3.7 1.5 0.2
## 12 4.8 3.4 1.6 0.2
## 13 4.8 3.0 1.4 0.1
## 14 4.3 3.0 1.1 0.1
## 15 5.8 4.0 1.2 0.2
## 16 5.7 4.4 1.5 0.4
## 17 5.4 3.9 1.3 0.4
## 18 5.1 3.5 1.4 0.3
## 19 5.7 3.8 1.7 0.3
## 20 5.1 3.8 1.5 0.3
## 21 5.4 3.4 1.7 0.2
## 22 5.1 3.7 1.5 0.4
## 23 4.6 3.6 1.0 0.2
## 24 5.1 3.3 1.7 0.5
## 25 4.8 3.4 1.9 0.2
## 26 5.0 3.0 1.6 0.2
## 27 5.0 3.4 1.6 0.4
## 28 5.2 3.5 1.5 0.2
## 29 5.2 3.4 1.4 0.2
## 30 4.7 3.2 1.6 0.2
## 31 4.8 3.1 1.6 0.2
## 32 5.4 3.4 1.5 0.4
## 33 5.2 4.1 1.5 0.1
## 34 5.5 4.2 1.4 0.2
## 35 4.9 3.1 1.5 0.2
## 36 5.0 3.2 1.2 0.2
## 37 5.5 3.5 1.3 0.2
## 38 4.9 3.6 1.4 0.1
## 39 4.4 3.0 1.3 0.2
## 40 5.1 3.4 1.5 0.2
## 41 5.0 3.5 1.3 0.3
## 42 4.5 2.3 1.3 0.3
## 43 4.4 3.2 1.3 0.2
## 44 5.0 3.5 1.6 0.6
## 45 5.1 3.8 1.9 0.4
## 46 4.8 3.0 1.4 0.3
## 47 5.1 3.8 1.6 0.2
## 48 4.6 3.2 1.4 0.2
## 49 5.3 3.7 1.5 0.2
## 50 5.0 3.3 1.4 0.2
## 51 7.0 3.2 4.7 1.4
## 52 6.4 3.2 4.5 1.5
## 53 6.9 3.1 4.9 1.5
## 54 5.5 2.3 4.0 1.3
## 55 6.5 2.8 4.6 1.5
## 56 5.7 2.8 4.5 1.3
## 57 6.3 3.3 4.7 1.6
## 58 4.9 2.4 3.3 1.0
## 59 6.6 2.9 4.6 1.3
## 60 5.2 2.7 3.9 1.4
## 61 5.0 2.0 3.5 1.0
## 62 5.9 3.0 4.2 1.5
## 63 6.0 2.2 4.0 1.0
## 64 6.1 2.9 4.7 1.4
## 65 5.6 2.9 3.6 1.3
## 66 6.7 3.1 4.4 1.4
## 67 5.6 3.0 4.5 1.5
## 68 5.8 2.7 4.1 1.0
## 69 6.2 2.2 4.5 1.5
## 70 5.6 2.5 3.9 1.1
## 71 5.9 3.2 4.8 1.8
## 72 6.1 2.8 4.0 1.3
## 73 6.3 2.5 4.9 1.5
## 74 6.1 2.8 4.7 1.2
## 75 6.4 2.9 4.3 1.3
## 76 6.6 3.0 4.4 1.4
## 77 6.8 2.8 4.8 1.4
## 78 6.7 3.0 5.0 1.7
## 79 6.0 2.9 4.5 1.5
## 80 5.7 2.6 3.5 1.0
## 81 5.5 2.4 3.8 1.1
## 82 5.5 2.4 3.7 1.0
## 83 5.8 2.7 3.9 1.2
## 84 6.0 2.7 5.1 1.6
## 85 5.4 3.0 4.5 1.5
## 86 6.0 3.4 4.5 1.6
## 87 6.7 3.1 4.7 1.5
## 88 6.3 2.3 4.4 1.3
## 89 5.6 3.0 4.1 1.3
## 90 5.5 2.5 4.0 1.3
## 91 5.5 2.6 4.4 1.2
## 92 6.1 3.0 4.6 1.4
## 93 5.8 2.6 4.0 1.2
## 94 5.0 2.3 3.3 1.0
## 95 5.6 2.7 4.2 1.3
## 96 5.7 3.0 4.2 1.2
## 97 5.7 2.9 4.2 1.3
## 98 6.2 2.9 4.3 1.3
## 99 5.1 2.5 3.0 1.1
## 100 5.7 2.8 4.1 1.3
## 101 6.3 3.3 6.0 2.5
## 102 5.8 2.7 5.1 1.9
## 103 7.1 3.0 5.9 2.1
## 104 6.3 2.9 5.6 1.8
## 105 6.5 3.0 5.8 2.2
## 106 7.6 3.0 6.6 2.1
## 107 4.9 2.5 4.5 1.7
## 108 7.3 2.9 6.3 1.8
## 109 6.7 2.5 5.8 1.8
## 110 7.2 3.6 6.1 2.5
## 111 6.5 3.2 5.1 2.0
## 112 6.4 2.7 5.3 1.9
## 113 6.8 3.0 5.5 2.1
## 114 5.7 2.5 5.0 2.0
## 115 5.8 2.8 5.1 2.4
## 116 6.4 3.2 5.3 2.3
## 117 6.5 3.0 5.5 1.8
## 118 7.7 3.8 6.7 2.2
## 119 7.7 2.6 6.9 2.3
## 120 6.0 2.2 5.0 1.5
## 121 6.9 3.2 5.7 2.3
## 122 5.6 2.8 4.9 2.0
## 123 7.7 2.8 6.7 2.0
## 124 6.3 2.7 4.9 1.8
## 125 6.7 3.3 5.7 2.1
## 126 7.2 3.2 6.0 1.8
## 127 6.2 2.8 4.8 1.8
## 128 6.1 3.0 4.9 1.8
## 129 6.4 2.8 5.6 2.1
## 130 7.2 3.0 5.8 1.6
## 131 7.4 2.8 6.1 1.9
## 132 7.9 3.8 6.4 2.0
## 133 6.4 2.8 5.6 2.2
## 134 6.3 2.8 5.1 1.5
## 135 6.1 2.6 5.6 1.4
## 136 7.7 3.0 6.1 2.3
## 137 6.3 3.4 5.6 2.4
## 138 6.4 3.1 5.5 1.8
## 139 6.0 3.0 4.8 1.8
## 140 6.9 3.1 5.4 2.1
## 141 6.7 3.1 5.6 2.4
## 142 6.9 3.1 5.1 2.3
## 143 5.8 2.7 5.1 1.9
## 144 6.8 3.2 5.9 2.3
## 145 6.7 3.3 5.7 2.5
## 146 6.7 3.0 5.2 2.3
## 147 6.3 2.5 5.0 1.9
## 148 6.5 3.0 5.2 2.0
## 149 6.2 3.4 5.4 2.3
## 150 5.9 3.0 5.1 1.8
OK, we can select columns - but can we do something with them? Yes, with mutate(), which allows one to create new columns or modify existing columns based on the values of other columns.
iris_select <- iris %>%
select(Species, Petal.Length, Petal.Width) %>%
mutate(Times.Bigger = (Petal.Length/Petal.Width)) # Create new column, set = to condition/function
iris_select <- iris %>%
select(Species, Petal.Length, Petal.Width) %>%
mutate(Times.Bigger = (Petal.Length/Petal.Width),
Log.Length = log(Petal.Length+1))
head(iris_select)
## Species Petal.Length Petal.Width Times.Bigger Log.Length
## 1 setosa 1.4 0.2 7.00 0.8754687
## 2 setosa 1.4 0.2 7.00 0.8754687
## 3 setosa 1.3 0.2 6.50 0.8329091
## 4 setosa 1.5 0.2 7.50 0.9162907
## 5 setosa 1.4 0.2 7.00 0.8754687
## 6 setosa 1.7 0.4 4.25 0.9932518
# BASE R equivalent: add a column with `$<-` (use `<-`, not `=`, for assignment per R style)
iris_copy <- iris
iris_copy$Time.Bigger <- iris_copy$Petal.Length / iris_copy$Petal.Width
head(iris_copy)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Time.Bigger
## 1 5.1 3.5 1.4 0.2 setosa 7.00
## 2 4.9 3.0 1.4 0.2 setosa 7.00
## 3 4.7 3.2 1.3 0.2 setosa 6.50
## 4 4.6 3.1 1.5 0.2 setosa 7.50
## 5 5.0 3.6 1.4 0.2 setosa 7.00
## 6 5.4 3.9 1.7 0.4 setosa 4.25
How can we work on specific rows? With filter() we can condition on column values, e.g., grab only the setosa species, or only flowers above a certain length.
setosa_small <- iris %>% # create from iris dataset
filter(Species == "setosa" & Sepal.Length < 5) # filter for species name and sepal length under 5
filter(iris, Petal.Length > 6 & Sepal.Length > 7)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.6 3.0 6.6 2.1 virginica
## 2 7.3 2.9 6.3 1.8 virginica
## 3 7.2 3.6 6.1 2.5 virginica
## 4 7.7 3.8 6.7 2.2 virginica
## 5 7.7 2.6 6.9 2.3 virginica
## 6 7.7 2.8 6.7 2.0 virginica
## 7 7.4 2.8 6.1 1.9 virginica
## 8 7.9 3.8 6.4 2.0 virginica
## 9 7.7 3.0 6.1 2.3 virginica
head(setosa_small) # view the new dataset
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.9 3.0 1.4 0.2 setosa
## 2 4.7 3.2 1.3 0.2 setosa
## 3 4.6 3.1 1.5 0.2 setosa
## 4 4.6 3.4 1.4 0.3 setosa
## 5 4.4 2.9 1.4 0.2 setosa
## 6 4.9 3.1 1.5 0.1 setosa
# we can actually combine the filter with mutate which is often useful
iris_flagged <- iris %>% mutate(TallPlants = Sepal.Length > 5.8 | Petal.Length > 3.5)
head(iris_flagged %>% filter(Sepal.Length > 5.8 | Petal.Length > 3.5))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species TallPlants
## 1 7.0 3.2 4.7 1.4 versicolor TRUE
## 2 6.4 3.2 4.5 1.5 versicolor TRUE
## 3 6.9 3.1 4.9 1.5 versicolor TRUE
## 4 5.5 2.3 4.0 1.3 versicolor TRUE
## 5 6.5 2.8 4.6 1.5 versicolor TRUE
## 6 5.7 2.8 4.5 1.3 versicolor TRUE
# One can use case_when to have not simply TRUE, FALSE values which can be useful when creating new variables
# case_when is like a vectorised if_else statement: the first matching condition wins
iris_flagged <- iris %>% mutate(TallPlants =
case_when(Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
TRUE ~ "Short"))
## case_when can be used with multiple conditions and can be thought of as
# if (condition) {
# do x
#} else { do y}
# NOTE(review): a plant with Sepal.Length exactly 5.8 (and Petal.Length <= 3.5) falls
# through to "Short" because neither the > 5.8 nor the < 5.8 branch matches - confirm intended
iris_flagged <- iris %>% mutate(TallPlants =
case_when(Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
Sepal.Length < 5.8 & Sepal.Length > 5 ~ "Medium",
TRUE ~ "Short"))
summarise() is similar to psych::describe() we saw above; in base R you can use aggregate(). The basic idea is that you can create a new dataframe/tibble that describes the data. Although with the current data this may not appear very useful, it can be very helpful when you have multi-level data, e.g., multiple trials from multiple subjects - we will see this later on. In its simplest form it just provides the mean of a column.
iris %>% summarise(mean(Sepal.Width))
## mean(Sepal.Width)
## 1 3.057333
iris %>% summarise(mean(Sepal.Width), n = n()) # n() counts how many observations
## mean(Sepal.Width) n
## 1 3.057333 150
# several summaries at once; unnamed ones get the expression itself as the column name
iris %>% summarise(mean(Sepal.Width), max(Sepal.Width), median(Sepal.Width),n = n())
## mean(Sepal.Width) max(Sepal.Width) median(Sepal.Width) n
## 1 3.057333 4.4 3 150
# base R similar way
aggregate(Sepal.Width~1, data = iris,mean)
## Sepal.Width
## 1 3.057333
# FUN may return several statistics at once; they come back bundled in one column
agg_Df<- aggregate(Sepal.Width~1, data = iris,FUN = function(x) c(mean = mean(x), sd = sd(x)))
head(agg_Df)
## Sepal.Width.mean Sepal.Width.sd
## 1 3.0573333 0.4358663
# some people use apply which applies a function to each element of data.frame - apply(iris[,1:4],2,mean) # 2 specifies that it should be applied over columns
# a benefit of summarise is that we can use group_by to calculate descriptives per group
#The group_by() function allows us to break up our dataset by a specific value. Above we wanted to see the mean, max, and median values of sepal width by species. We can use group_by(Species) for this to work.
iris %>% group_by(Species) %>% summarise(mean(Sepal.Width), max(Sepal.Width), median(Sepal.Width),n = n())
## # A tibble: 3 × 5
## Species `mean(Sepal.Width)` `max(Sepal.Width)` `median(Sepal.Width)` n
## <fct> <dbl> <dbl> <dbl> <int>
## 1 setosa 3.43 4.4 3.4 50
## 2 versicolor 2.77 3.4 2.8 50
## 3 virginica 2.97 3.8 3 50
# we can also arrange data
#
# arrange() allows us to rearrange our data based on certain conditions. Let’s create a new dataset called virginica that contains only the virginica species, has sepal widths greater than 2.9, and is arranged in descending sepal lengths (we will use the desc() function).
#
# NOTE: desc() stands for descending. It will places the rows in descending order.
# Virginica only, Sepal.Width above 2.9, sorted from longest to shortest sepal
iris_new <- iris %>%
  filter(Species == "virginica", Sepal.Width > 2.9) %>% # comma-separated conditions are ANDed
  arrange(desc(Sepal.Length)) # desc() flips the default ascending sort
iris_new # print the resulting data frame
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.9 3.8 6.4 2.0 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 3.0 6.1 2.3 virginica
## 4 7.6 3.0 6.6 2.1 virginica
## 5 7.2 3.6 6.1 2.5 virginica
## 6 7.2 3.2 6.0 1.8 virginica
## 7 7.2 3.0 5.8 1.6 virginica
## 8 7.1 3.0 5.9 2.1 virginica
## 9 6.9 3.2 5.7 2.3 virginica
## 10 6.9 3.1 5.4 2.1 virginica
## 11 6.9 3.1 5.1 2.3 virginica
## 12 6.8 3.0 5.5 2.1 virginica
## 13 6.8 3.2 5.9 2.3 virginica
## 14 6.7 3.3 5.7 2.1 virginica
## 15 6.7 3.1 5.6 2.4 virginica
## 16 6.7 3.3 5.7 2.5 virginica
## 17 6.7 3.0 5.2 2.3 virginica
## 18 6.5 3.0 5.8 2.2 virginica
## 19 6.5 3.2 5.1 2.0 virginica
## 20 6.5 3.0 5.5 1.8 virginica
## 21 6.5 3.0 5.2 2.0 virginica
## 22 6.4 3.2 5.3 2.3 virginica
## 23 6.4 3.1 5.5 1.8 virginica
## 24 6.3 3.3 6.0 2.5 virginica
## 25 6.3 3.4 5.6 2.4 virginica
## 26 6.2 3.4 5.4 2.3 virginica
## 27 6.1 3.0 4.9 1.8 virginica
## 28 6.0 3.0 4.8 1.8 virginica
## 29 5.9 3.0 5.1 1.8 virginica
# psych's describeBy gives per-group descriptive statistics in one call
psych::describeBy(iris, group="Species")
##
## Descriptive statistics by group
## Species: setosa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Sepal.Length 1 50 5.01 0.35 5.0 5.00 0.30 4.3 5.8 1.5 0.11 -0.45 0.05
## Sepal.Width 2 50 3.43 0.38 3.4 3.42 0.37 2.3 4.4 2.1 0.04 0.60 0.05
## Petal.Length 3 50 1.46 0.17 1.5 1.46 0.15 1.0 1.9 0.9 0.10 0.65 0.02
## Petal.Width 4 50 0.25 0.11 0.2 0.24 0.00 0.1 0.6 0.5 1.18 1.26 0.01
## Species* 5 50 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0.0 NaN NaN 0.00
## ----------------------------------------------------------------------------------------------
## Species: versicolor
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Sepal.Length 1 50 5.94 0.52 5.90 5.94 0.52 4.9 7.0 2.1 0.10 -0.69 0.07
## Sepal.Width 2 50 2.77 0.31 2.80 2.78 0.30 2.0 3.4 1.4 -0.34 -0.55 0.04
## Petal.Length 3 50 4.26 0.47 4.35 4.29 0.52 3.0 5.1 2.1 -0.57 -0.19 0.07
## Petal.Width 4 50 1.33 0.20 1.30 1.32 0.22 1.0 1.8 0.8 -0.03 -0.59 0.03
## Species* 5 50 2.00 0.00 2.00 2.00 0.00 2.0 2.0 0.0 NaN NaN 0.00
## ----------------------------------------------------------------------------------------------
## Species: virginica
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Sepal.Length 1 50 6.59 0.64 6.50 6.57 0.59 4.9 7.9 3.0 0.11 -0.20 0.09
## Sepal.Width 2 50 2.97 0.32 3.00 2.96 0.30 2.2 3.8 1.6 0.34 0.38 0.05
## Petal.Length 3 50 5.55 0.55 5.55 5.51 0.67 4.5 6.9 2.4 0.52 -0.37 0.08
## Petal.Width 4 50 2.03 0.27 2.00 2.03 0.30 1.4 2.5 1.1 -0.12 -0.75 0.04
## Species* 5 50 3.00 0.00 3.00 3.00 0.00 3.0 3.0 0.0 NaN NaN 0.00
We often want to rename columns - rename() / rename_with(); or, if we want to change the names of values within a column - recode().
print(names(iris_flagged))
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" "TallPlants"
# rename(new_name = old_name); `=` is the documented form (`:=` also works via rlang but is unnecessary here)
iris_flagged <- iris_flagged %>% rename(Height = TallPlants)
print(names(iris_flagged))
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" "Height"
iris %>%
rename_with(~ tolower(gsub("\\.", "_", .x)), .cols = everything())
## sepal_length sepal_width petal_length petal_width species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# function(.x) {
# tolower(gsub("\\.", "_", .x))
# }
# with ~ we are creating a function but not asigning it to a variable - another short hand
# https://coolbutuseless.github.io/2019/03/13/anonymous-functions-in-r-part-1/
# NOTE although the original data had name Species, now because of tolower we use species
# Rename columns to snake_case, then recode species names into letter codes.
# NOTE: after tolower() the column is called `species` (lowercase), not `Species`.
iris_clean <- iris %>%
rename_with(~ tolower(gsub("\\.", "_", .x)), .cols = everything()) %>%
mutate(species_new = recode(species,
"setosa" = "A",
"versicolor" = "B",
"virginica"= "C"))
head(iris_clean)
## sepal_length sepal_width petal_length petal_width species species_new
## 1 5.1 3.5 1.4 0.2 setosa A
## 2 4.9 3.0 1.4 0.2 setosa A
## 3 4.7 3.2 1.3 0.2 setosa A
## 4 4.6 3.1 1.5 0.2 setosa A
## 5 5.0 3.6 1.4 0.2 setosa A
## 6 5.4 3.9 1.7 0.4 setosa A
plot(iris) # not pretty but quick - it picks the plot you might want; here it does a pairs plot (a scatter plot for each pair of variables)
hist(iris$Sepal.Length) # base histogram of one column
boxplot(iris$Sepal.Length ~ iris$Species) # first y axis then x axis # can check for outliers
# lattice::dotplot(iris$Sepal.Length ~ iris$Species)
plot(iris$Sepal.Length ~ iris$Sepal.Width) # formula interface: y ~ x scatter plot
Visualization - the plotting can be done with basic plots but most
people use the ggplot library which can create a bit prettier plots
“ggplot2 is designed to work iteratively. You start with a layer that shows the raw data. Then you add layers of annotations and statistical summaries. This allows you to produce graphics using the same structured thinking that you would use to design an analysis. This reduces the distance between the plot in your head and the one on the page. This is especially helpful for students who have not yet developed the structured approach to analysis used by experts.”
ggplot2 book - https://ggplot2-book.org/getting-started.html
A plot in ggplot2 is described in three parts:
Aesthetics: Relationship between data and visual properties that define working space of the plot (which variables map on individual axes, color, size, fill, etc.). Geometrical primitives that visualize your data (points, lines, error bars, etc.) that are added to the plot. Other properties of the plot (scaling of axes, labels, annotations, etc.) that are added to the plot.
Every ggplot2 plot has three key components:
data,
A set of aesthetic mappings between variables in the data and visual properties, and
At least one layer which describes how to render each observation. Layers are usually created with a geom function.
ggplot2 Main Logic ggplot2 follows a layered grammar of graphics:
Start with data + aesthetics: ggplot(data, aes(x = …, y = …))
Add geoms (visual elements): + geom_point(), geom_line(), geom_bar() Map additional aesthetics (color, shape, size):
aes(color = …, shape = …) Customize scales and labels:
scale_color_manual(), labs(), theme_minimal()
Think of it as building a plot step-by-step, like layers on a cake.
library(ggplot2)
library(patchwork)
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, col = Species)) + geom_point()
# notice how the data argument is not needed
# iris %>% ggplot(aes(x = Sepal.Width, y = Sepal.Length, col = Species)) + geom_point()
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
geom_point() +
# a linear regression over all dots in the group
geom_smooth(method="lm", formula = y ~ x, se=TRUE, linetype="dashed")
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
geom_point() +
# Group-specific regressions
geom_smooth(method = "lm", formula = y ~ x, se=TRUE, linetype="dashed") +
# Global regression overlay
geom_smooth(
aes(group = 1), # 👈 tells ggplot to treat all data as one group
method = "lm",
formula = y ~ x,
se = TRUE,
color = "black", # make it stand out (or whatever color you want)
linewidth = 1.2,
linetype = "solid" # solid line for global fit
)
## what if we want each Species to be a separate plot
# (removed a stray trailing comma after se = TRUE - it created an empty argument in the call)
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
geom_point() +
geom_smooth(
method = "lm",
formula = y ~ x,
se = TRUE
) + facet_grid(. ~ Species) + # makes a separate subplot for each group
theme(legend.position = "none") # we don't need the legend as conditions are split between facets
# Grab default ggplot colors and apply alpha
# default_colors <- scales::hue_pal()(3) # get 3 default colors
# transparent_colors <- alpha(default_colors, 0.5)
#
# ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
# geom_point() +
# geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linetype = "dashed") +
# scale_color_manual(values = transparent_colors) +
# theme_minimal()
Often we need more than lines on a grid, so how do we change labels or specify certain colors for groups of variables
iris_flagged <- iris %>% mutate(TallPlants =
case_when(Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
Sepal.Length < 5.8 & Sepal.Length > 5 ~ "Medium",
TRUE ~ "Short"))
ggplot(iris_flagged, aes(x = Petal.Length, y = Petal.Width)) +
geom_point(aes(color = TallPlants, shape = TallPlants), size = 3) +
labs(
title = "Iris: Tall, Medium, and Short Plants",
x = "Petal Length",
y = "Petal Width",
color = "Plant Height",
shape = "Plant Height"
) +
theme_minimal()
ggplot(iris_flagged, aes(x = Petal.Length, y = Petal.Width)) +
geom_point(aes(color = TallPlants, shape = TallPlants), size = 3) + scale_color_manual(values = c("Tall" = "red", "Medium" = "orange", "Short" = "gray")) +
labs(
title = "Iris: Tall, Medium, and Short Plants",
x = "Petal Length",
y = "Petal Width",
color = "Plant Height",
shape = "Plant Height"
) +
theme_minimal()
iris %>% ggplot(aes(x= Petal.Length, y = Petal.Width)) +geom_point(color="blue")
iris %>% ggplot(aes(Petal.Length)) + geom_histogram() + theme_minimal() # lets remove the annoying grid
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
P<- iris %>% ggplot(aes(Petal.Length)) + geom_histogram(fill="skyblue", color = "black", bins = 30) + theme_minimal() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # lets remove the annoying grid
theme(text = element_text(size = 14)) + # increase the text size
labs(title = "Distribution of Petal Length in Iris Dataset",
x = "Petal Length (cm)",
y = "Count")
P
# width/height are in inches by default; dpi controls the raster resolution of the PNG
ggsave('./Histogram_Petal_Length.png', P, width = 15, height = 8, dpi = 300)
## ok but often we might want more control over text elements
iris %>% ggplot(aes(Petal.Length)) + geom_histogram(fill="skyblue", color = "black", bins = 30) + theme_minimal() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # lets remove the annoying grid
theme(
plot.title = element_text(size = 20, face = "bold", hjust = 0.5), # center title and make it bold and tell it explicitly what size it should be
axis.title.x = element_text(size = 14), # xlabel
axis.title.y = element_text(size = 18), # ylabel
axis.text.y = element_text(size = 12), # yticklabels
axis.text.x = element_text(size = 10), # xticklabels
text = element_text(family = "serif") # applies to all labels by default
) +
labs(title = "Distribution of Petal Length in Iris Dataset",
x = "Petal Length (cm)",
y = "Count")
## Use above theme object to manipulate your figures as you wish
## what if we want to examine histogram grouped by species
iris %>% ggplot(aes(Petal.Length, fill = Species)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
https://rstudio.github.io/cheatsheets/html/data-visualization.html
library(ggExtra)
g<- ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
geom_point() +
# a linear regression over all dots in the group
geom_smooth(method="lm", formula = y ~ x, se=TRUE, linetype="dashed")
ggMarginal(g, type = "histogram", fill="transparent")
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of the numeric columns, drawn as numbers in the lower triangle.
# (Removed a stray empty first argument - `corrplot(, ...)` passed an empty positional argument.)
iris %>% select(where(is.numeric)) %>% cor() %>% corrplot(type='lower',diag = FALSE, method='number', col = colorRampPalette(c("blue", "white", "red"))(200))
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
iris %>% select(where(is.numeric)) %>% rename(
S_length = Sepal.Length,
S_width = Sepal.Width,
P_length = Petal.Length,
P_width = Petal.Width
)%>% ggpairs( lower = list(continuous = "points", combo = "dot_no_facet"),progress=FALSE,title='Iris') + theme_classic(base_size=15)
# Keep Species for mapping but exclude from actual pairs
# Keep Species so it can drive the colour mapping, but plot only the
# four numeric columns in the pairs grid.
iris_data <- iris %>%
  select(Species, where(is.numeric)) %>%
  rename(S_length = Sepal.Length, S_width = Sepal.Width,
         P_length = Petal.Length, P_width = Petal.Width)
ggpairs(iris_data,
        columns = 2:5,  # skip column 1 (Species); it is only used for colour
        mapping = aes(color = Species),
        lower = list(continuous = "points", combo = "dot_no_facet"),
        progress = FALSE,
        title = "Iris") +
  theme_classic(base_size = 15)
OK, it is unlikely one would ever master all the data wrangling needed, because each project requires a different set of options — Google and ChatGPT are your friends. Often you can do everything equally well in base R or the tidyverse; it is just a style preference. Just like learning a natural language, it does not make sense to memorise vocabulary if you have nothing to say, so the best way to learn is to start using these skills to analyse your own data.
The simplest thing we can do is a t-test: let's compare the petal length of setosa and virginica.
# Two-sample t-test: does mean petal length differ between setosa and
# virginica? Keep only those two species and the columns we need.
iris_t_Test_data <- iris %>% select(Species,Petal.Length) %>% filter(Species %in% c('setosa', 'virginica'))
#
t.test(Petal.Length ~ Species, data= iris_t_Test_data, var.equal=TRUE)# note: var.equal = TRUE gives the classic Student t-test; the default (FALSE) is Welch's
##
## Two Sample t-test
##
## data: Petal.Length by Species
## t = -49.986, df = 98, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group setosa and group virginica is not equal to 0
## 95 percent confidence interval:
## -4.252374 -3.927626
## sample estimates:
## mean in group setosa mean in group virginica
## 1.462 5.552
var.test(Petal.Length ~ Species, data= iris_t_Test_data, var.equal=TRUE)# note
##
## F test to compare two variances
##
## data: Petal.Length by Species
## F = 0.099016, num df = 49, denom df = 49, p-value = 1.875e-13
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.05618945 0.17448557
## sample estimates:
## ratio of variances
## 0.0990164
# Visual check of the two groups: a boxplot and overlaid histograms.
ggplot(data = iris_t_Test_data, aes(x= Species, y = Petal.Length)) + geom_boxplot() + theme_minimal()
ggplot(data = iris_t_Test_data, aes(x= Petal.Length, fill = Species)) + geom_histogram() + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The same two-group comparison expressed as a linear model: the
# Speciesvirginica coefficient is the mean difference, and its t value
# matches the two-sample t-test above (up to sign).
model_linear <- lm(Petal.Length ~ Species, data= iris_t_Test_data)
summary(model_linear)
##
## Call:
## lm(formula = Petal.Length ~ Species, data = iris_t_Test_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0520 -0.1620 0.0380 0.1405 1.3480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.46200 0.05786 25.27 <2e-16 ***
## Speciesvirginica 4.09000 0.08182 49.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4091 on 98 degrees of freedom
## Multiple R-squared: 0.9623, Adjusted R-squared: 0.9619
## F-statistic: 2499 on 1 and 98 DF, p-value: < 2.2e-16
ANOVA in base R - https://bookdown.org/steve_midway/DAR/understanding-anova-in-r.html and https://statsandr.com/blog/anova-in-r/ . Remember that it expects the predictor (IV) to be a factor.
# Dummy (treatment) coding: the intercept is the reference-level (setosa)
# mean and each coefficient is a difference from that reference.
contrasts(iris$Species) <- "contr.treatment"
lm_model <- lm(Petal.Length ~ Species, data = iris)
summary(lm_model)
##
## Call:
## lm(formula = Petal.Length ~ Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.260 -0.258 0.038 0.240 1.348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.46200 0.06086 24.02 <2e-16 ***
## Speciesversicolor 2.79800 0.08607 32.51 <2e-16 ***
## Speciesvirginica 4.09000 0.08607 47.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4303 on 147 degrees of freedom
## Multiple R-squared: 0.9414, Adjusted R-squared: 0.9406
## F-statistic: 1180 on 2 and 147 DF, p-value: < 2.2e-16
contrasts(iris$Species)
## versicolor virginica
## setosa 0 0
## versicolor 1 0
## virginica 0 1
# One-way ANOVA — the same model as lm() above fitted with aov();
# the coefficients match because both use treatment coding here.
one.way <- aov(Petal.Length ~ Species, data = iris)
print(one.way$coefficients)
## (Intercept) Speciesversicolor Speciesvirginica
## 1.462 2.798 4.090
summary(one.way)
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 437.1 218.55 1180 <2e-16 ***
## Residuals 147 27.2 0.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#one_way_Welch<- oneway.test(Petal.Length ~ Species, data = iris) # Welch ANOVA (no equal-variance assumption)
#OneWay_Kruskal<- kruskal.test(Petal.Length ~ Species, data = iris) # non-parametric alternative
#plot(one.way) # diagnostic plots of the fitted model
# Tukey's HSD: all pairwise group comparisons, corrected for multiplicity.
print(TukeyHSD(one.way))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Petal.Length ~ Species, data = iris)
##
## $Species
## diff lwr upr p adj
## versicolor-setosa 2.798 2.59422 3.00178 0
## virginica-setosa 4.090 3.88622 4.29378 0
## virginica-versicolor 1.292 1.08822 1.49578 0
# library(multcomp)
# summary(glht(res.aov, linfct = mcp(group = "Tukey")))
# see https://www.sthda.com/english/wiki/one-way-anova-test-in-r
# pairwise.t.test(my_data$weight, my_data$group, p.adjust.method = "BH")
# anova(lm_model)
# options(contrasts = c("contr.sum", "contr.poly"))
# For sum-to-zero (deviation) coding
# For sum-to-zero (deviation) coding: the intercept becomes the grand mean
# and each coefficient is one level's deviation from that grand mean.
contrasts(iris$Species) <- "contr.sum"
# contrasts(iris$Species) <- "contr.helmert" # Helmert coding is another option
(contrasts(iris$Species) ) # wrapping in () prints the assigned contrast matrix
## [,1] [,2]
## setosa 1 0
## versicolor 0 1
## virginica -1 -1
# Refit under sum coding: identical fit and R^2, different parametrisation.
lm_sum<- lm(Petal.Length ~ Species, data= iris)
summary(lm_sum)
##
## Call:
## lm(formula = Petal.Length ~ Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.260 -0.258 0.038 0.240 1.348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.75800 0.03514 106.95 <2e-16 ***
## Species1 -2.29600 0.04969 -46.21 <2e-16 ***
## Species2 0.50200 0.04969 10.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4303 on 147 degrees of freedom
## Multiple R-squared: 0.9414, Adjusted R-squared: 0.9406
## F-statistic: 1180 on 2 and 147 DF, p-value: < 2.2e-16
With sum coding the intercept is the grand mean and each coefficient is one level's deviation from it; the deviation for the last level (virginica) is not printed but equals minus the sum of the others: -(-2.296 + 0.502) = 1.794.
Grand mean: 3.758
Setosa's mean: 3.758 + (-2.296) = 1.462
Versicolor's mean: 3.758 + 0.502 = 4.260
Virginica's mean: 3.758 + 1.794 = 5.552
these re-parametrisations of models are explained well here - https://www.youtube.com/playlist?list=PLB2iAtgpI4YHUJcyXnaoR3sWKSdhYDtjD
# Bar plot of the per-species means with +/- 1 SE error bars,
# both computed on the fly by stat_summary().
mean_bars <- stat_summary(fun = mean, geom = "bar", color = "black", width = 0.6)
se_bars   <- stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.2)
ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  mean_bars +
  se_bars +
  theme_classic(base_size = 15)
see for options - https://www.sthda.com/english/articles/index.php?url=/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/
library(ggpubr)
# Perform pairwise comparisons between all species pairs with t-tests;
# p.adj is adjusted for multiple comparisons.
compare_means(Petal.Length ~ Species, data = iris, paired = FALSE, effectsize = TRUE, method="t.test")
## # A tibble: 3 × 8
## .y. group1 group2 p p.adj p.format p.signif method
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Petal.Length setosa versicolor 9.93e-46 2 e-45 <2e-16 **** T-test
## 2 Petal.Length setosa virginica 9.27e-50 2.8e-49 <2e-16 **** T-test
## 3 Petal.Length versicolor virginica 4.90e-22 4.9e-22 <2e-16 **** T-test
# Boxplot with per-pair significance stars plus an overall ANOVA p-value.
my_comparisons <- list(c('setosa','versicolor'), c('setosa','virginica'), c('versicolor','virginica'))
ggboxplot(iris, x = "Species", y = "Petal.Length") +
stat_compare_means(comparisons = my_comparisons, method = "t.test", label = "p.signif") + # stars for each pair
stat_compare_means(method = "anova",label.y = 9) # global test, printed at y = 9
There are other packages for anovas like easyanova and ezANOVA, but often one can also do a lm model like above as long as consideration is taken about the contrast coding. e.g., lm(Y ~ Factor1*Factor2) - would give you main effect of factor 1, main effect of factor 2 and their interaction.
Regression model
rm(list=ls()) # cleans environment (fine in a tutorial; avoid in reusable scripts)
#library(ggplot2)
library(tidyverse)
# Cam-CAN ageing dataset hosted on OSF (downloaded over the network).
url<- 'https://osf.io/download/kbd84/'
# https://doi.org/10.1016/j.neurobiolaging.2023.11.009
library(readr)
data_df <- read_csv(url)
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## CCID = col_character(),
## age = col_double(),
## Sex = col_double(),
## CattellPCA = col_double(),
## Memory_700 = col_double(),
## Mov_SyS = col_double(),
## Rest_SyS = col_double(),
## SMT_SyS = col_double(),
## TIV = col_double(),
## Education = col_double()
## )
head(data_df)
## # A tibble: 6 × 10
## CCID age Sex CattellPCA Memory_700 Mov_SyS Rest_SyS SMT_SyS TIV Education
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CC110033 24 1 21.1 23.7 0.691 0.704 0.684 1.37 3
## 2 CC110037 18 1 15.3 20.8 0.657 0.702 0.715 1.32 2
## 3 CC110045 24 2 20.6 23.6 0.729 0.725 0.735 1.35 3
## 4 CC110056 22 2 12.5 21.5 0.568 0.595 0.698 1.25 3
## 5 CC110062 20 1 NaN 22.1 NaN NaN NaN 1.55 2
## 6 CC110069 28 2 20.7 30.7 0.651 0.657 0.711 1.36 3
This is real data. Before you continue, try to clean it a bit: 1) Remove rows with NaN values for the main variables we will focus on - CattellPCA, Memory_700, Mov_SyS, Rest_SyS, SMT_SyS. 2) Create a new column with sex recoded as -1 and 1, and make Sex a factor (1 = Male, 2 = Female). 3) Plot how Mov_SyS differs across age. 4) Fit a linear model predicting Mov_SyS with age*Sex. 5) Fit a model predicting CattellPCA ~ SyS + age*Sex. 6) Plot partial regressions to understand the model.
Solution
# Drop rows with missing values in any of the key variables, then recode
# Sex: a labelled factor plus a numeric -1/+1 version (SexC) for modelling.
key_vars <- c("CattellPCA", "Memory_700", "Mov_SyS", "Rest_SyS", "SMT_SyS")
df_Schaefer <- data_df %>%
  filter(if_all(all_of(key_vars), ~ !is.na(.x))) %>%
  mutate(
    Sex = factor(Sex, labels = c("Male","Female")),   # 1 -> Male, 2 -> Female
    SexC = case_when(
      Sex == "Male" ~ -1,
      Sex == "Female" ~ 1,
      TRUE ~ NA_real_
    )
  )
head(df_Schaefer)
## # A tibble: 6 × 11
## CCID age Sex CattellPCA Memory_700 Mov_SyS Rest_SyS SMT_SyS TIV Education SexC
## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CC110033 24 Male 21.1 23.7 0.691 0.704 0.684 1.37 3 -1
## 2 CC110037 18 Male 15.3 20.8 0.657 0.702 0.715 1.32 2 -1
## 3 CC110045 24 Female 20.6 23.6 0.729 0.725 0.735 1.35 3 1
## 4 CC110056 22 Female 12.5 21.5 0.568 0.595 0.698 1.25 3 1
## 5 CC110069 28 Female 20.7 30.7 0.651 0.657 0.711 1.36 3 1
## 6 CC110087 28 Female 18.2 22.2 0.675 0.652 0.700 1.25 3 1
Needed later on (optional): one can use a polynomial expansion of age to account for the curved relationship between age and SyS - sometimes base R is easier than tidyverse.
# Orthogonal quadratic polynomial of age (poly() returns a 2-column basis),
# split into separate columns and z-scored for use as predictors.
df_Schaefer$ageQuad <- poly(df_Schaefer$age, 2)
df_Schaefer$agepoly_1 <- df_Schaefer$ageQuad[,1] # linear age term
df_Schaefer$agepoly_2 <- df_Schaefer$ageQuad[,2] # quadratic age term
df_Schaefer$agepoly_1z <- datawizard::standardise(df_Schaefer$agepoly_1)
df_Schaefer$agepoly_2z <- datawizard::standardise(df_Schaefer$agepoly_2)
### MOVIE
# Scatter of Mov_SyS against age with a quadratic (raw polynomial) fit.
# Fixes: element_line(size =) is deprecated in current ggplot2 -> linewidth
# (the stat_smooth call below already uses linewidth), and the x breaks are
# written directly instead of round(seq(20, max(80), by = 20), 1) — max(80)
# is just 80 and rounding integers is a no-op, so output is unchanged.
p <- ggplot(df_Schaefer, aes(x = age, y = Mov_SyS)) +
  geom_point(shape = 21, size = 3, colour = "black", fill = "white", stroke = 2) +
  stat_smooth(method = "lm", se = TRUE, fill = "grey60",
              formula = y ~ poly(x, 2, raw = TRUE),
              colour = "springgreen3", linewidth = 3) +
  scale_x_continuous(breaks = seq(20, 80, by = 20), limits = c(15, 90)) + #ylim(0,400) +
  theme_bw() +
  theme(panel.border = element_blank(),
        legend.position = "none",
        text = element_text(size = 18),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black", linewidth = 1.5),
        axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  labs(x = 'Age', y = 'Functional Segregation', title = 'Movie: System Segregation and Age')
p
# Movie
# Linear model: movie-watching system segregation as a function of the
# z-scored linear and quadratic age terms, sex (coded -1/+1), and the
# age-by-sex interactions.
lm_model_Movie <- lm(Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC + agepoly_2z:SexC,
data = df_Schaefer);
summary(lm_model_Movie)
##
## Call:
## lm(formula = Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC +
## agepoly_2z:SexC, data = df_Schaefer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31635 -0.02377 0.00645 0.02964 0.12212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6163288 0.0019974 308.564 < 2e-16 ***
## agepoly_1z -0.0454595 0.0019999 -22.731 < 2e-16 ***
## agepoly_2z -0.0108376 0.0019993 -5.421 8.5e-08 ***
## SexC 0.0015817 0.0019974 0.792 0.429
## agepoly_1z:SexC -0.0003821 0.0019999 -0.191 0.849
## agepoly_2z:SexC -0.0008270 0.0019993 -0.414 0.679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04999 on 621 degrees of freedom
## Multiple R-squared: 0.4689, Adjusted R-squared: 0.4646
## F-statistic: 109.7 on 5 and 621 DF, p-value: < 2.2e-16
library(heplots)
# Partial eta^2 effect sizes for each term of the model above.
etasq(lm_model_Movie,anova=FALSE,partial = TRUE)
## Partial eta^2
## agepoly_1z 4.542285e-01
## agepoly_2z 4.527170e-02
## SexC 1.009443e-03
## agepoly_1z:SexC 5.877863e-05
## agepoly_2z:SexC 2.754844e-04
## Residuals NA
car::linearHypothesis(lm_model_Movie,c('agepoly_1z=0','agepoly_2z=0'))
##
## Linear hypothesis test:
## agepoly_1z = 0
## agepoly_2z = 0
##
## Model 1: restricted model
## Model 2: Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC +
## agepoly_2z:SexC
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 623 2.9175
## 2 621 1.5521 2 1.3654 273.16 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MOVIE
# Does movie system segregation (z-scored) predict fluid intelligence
# (CattellPCA) over and above age and sex, including all interactions?
lm_model_M <- lm(CattellPCA ~ scale(Mov_SyS)*agepoly_1z*SexC + scale(Mov_SyS)*agepoly_2z*SexC,
data = df_Schaefer);
summary(lm_model_M)
##
## Call:
## lm(formula = CattellPCA ~ scale(Mov_SyS) * agepoly_1z * SexC +
## scale(Mov_SyS) * agepoly_2z * SexC, data = df_Schaefer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0098 -1.5922 0.1145 1.7964 6.0411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.04076 0.14161 113.277 < 2e-16 ***
## scale(Mov_SyS) 0.60170 0.15441 3.897 0.000108 ***
## agepoly_1z -1.93255 0.15815 -12.219 < 2e-16 ***
## SexC -0.27822 0.14161 -1.965 0.049893 *
## agepoly_2z -0.50527 0.13203 -3.827 0.000143 ***
## scale(Mov_SyS):agepoly_1z -0.13875 0.16115 -0.861 0.389574
## scale(Mov_SyS):SexC -0.18044 0.15441 -1.169 0.243013
## agepoly_1z:SexC -0.10541 0.15815 -0.667 0.505335
## scale(Mov_SyS):agepoly_2z -0.10384 0.11965 -0.868 0.385799
## SexC:agepoly_2z -0.01222 0.13203 -0.093 0.926300
## scale(Mov_SyS):agepoly_1z:SexC -0.03240 0.16115 -0.201 0.840714
## scale(Mov_SyS):SexC:agepoly_2z -0.04806 0.11965 -0.402 0.688073
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.45 on 615 degrees of freedom
## Multiple R-squared: 0.4859, Adjusted R-squared: 0.4767
## F-statistic: 52.84 on 11 and 615 DF, p-value: < 2.2e-16
etasq(lm_model_M)
## Partial eta^2
## scale(Mov_SyS) 2.541466e-02
## agepoly_1z 2.281567e-01
## SexC 1.040679e-02
## agepoly_2z 2.746409e-02
## scale(Mov_SyS):agepoly_1z 1.160089e-03
## scale(Mov_SyS):SexC 3.545192e-03
## agepoly_1z:SexC 1.034644e-03
## scale(Mov_SyS):agepoly_2z 1.338773e-03
## SexC:agepoly_2z 6.150349e-05
## scale(Mov_SyS):agepoly_1z:SexC 6.573073e-05
## scale(Mov_SyS):SexC:agepoly_2z 2.622582e-04
## Residuals NA
library(visreg)
# Partial-regression plots: each predictor's effect with the other
# predictors held at reference values (the warnings flag the interactions).
visreg(lm_model_M)
## Warning: Note that you are attempting to plot a 'main effect' in a model that contains an
## interaction. This is potentially misleading; you may wish to consider using the 'by'
## argument.
## Conditions used in construction of plot
## agepoly_1z: 0.0322679
## SexC: -1
## agepoly_2z: -0.3346304
## Warning: Note that you are attempting to plot a 'main effect' in a model that contains an
## interaction. This is potentially misleading; you may wish to consider using the 'by'
## argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## SexC: -1
## agepoly_2z: -0.3346304
## Warning: Note that you are attempting to plot a 'main effect' in a model that contains an
## interaction. This is potentially misleading; you may wish to consider using the 'by'
## argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## agepoly_1z: 0.0322679
## agepoly_2z: -0.3346304
## Warning: Note that you are attempting to plot a 'main effect' in a model that contains an
## interaction. This is potentially misleading; you may wish to consider using the 'by'
## argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## agepoly_1z: 0.0322679
## SexC: -1
OK, what if we want a mixed-effect model? You can watch this series for a bit of theory - https://www.youtube.com/watch?v=5D5zEA29tu4&list=PLB2iAtgpI4YEAUiEQ1ZnfMXY-yewNzn9z&index=3&t=110s . You can think of them as fancy regression, but they can get a bit involved, especially when dealing with non-normal data. Multiple free courses are available online; the main package for this is lme4.
library(lme4)
# sleepstudy: reaction times over days of sleep deprivation, 18 subjects.
head(sleepstudy)
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## 4 321.4398 3 308
## 5 356.8519 4 308
## 6 414.6901 5 308
library(lattice)
# often a nice quick way to see multi-level data
# 1. Conditioned scatterplot: one small panel per subject, with points
# plus a per-subject regression line (type "p" + "r").
xyplot(Reaction ~ Days | Subject, data = sleepstudy,
layout = c(4, 5),
type = c("p", "r"), # points + regression line
main = "Reaction Time by Days for Each Subject")
# ggplot equivalent of the lattice plot above: one facet per subject.
ggplot(sleepstudy, aes(Days, Reaction, group=Subject, colour=Subject)) +
geom_point() +
facet_wrap(~Subject, ncol=9) +
scale_x_continuous(limits=c(0, 10),breaks=c(0,10)) +
theme_minimal()
# 1. Basic scatter plot of Reaction vs. Days, colored by Subject
ggplot(sleepstudy, aes(x = Days, y = Reaction, color = Subject)) +
geom_point(show.legend = FALSE) +
theme_minimal(base_size = 15) +
labs(title = "Reaction Time by Days of Sleep Deprivation",
y = "Reaction Time (ms)")
# 2. Add individual linear trends (random slopes visualization!)
ggplot(sleepstudy, aes(x = Days, y = Reaction, group = Subject, color = Subject)) +
geom_point(show.legend = FALSE) +
geom_line() +
theme_minimal(base_size = 15) +
labs(title = "Individual Reaction Time Trajectories",
y = "Reaction Time (ms)")
# 3. Overall linear trend (fixed effect only) + individual lines.
# Fix: geom_smooth(size =) is deprecated in current ggplot2 -> linewidth
# (consistent with the linewidth usage elsewhere in this document).
ggplot(sleepstudy, aes(x = Days, y = Reaction)) +
  geom_point(aes(color = Subject)) +
  geom_line(aes(group = Subject, color = Subject), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linewidth = 1.2) +
  theme_minimal(base_size = 15) +
  labs(title = "Overall Trend and Individual Slopes",
       y = "Reaction Time (ms)")
## `geom_smooth()` using formula = 'y ~ x'
# Per-subject dashed OLS fits plus one overall fit (black).
# Fix: geom_smooth(size =) is deprecated in current ggplot2 -> linewidth.
ggplot(sleepstudy, aes(x = Days, y = Reaction, color = Subject)) +
  geom_point(alpha = 0.5, show.legend = FALSE) +
  # Smooth line **per Subject**
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.7, linetype = "dashed", show.legend = FALSE) +
  # Overall smooth line (no grouping)
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 1.2) +
  theme_minimal(base_size = 15) +
  labs(title = "Per-Subject and Overall Trends",
       y = "Reaction Time (ms)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Baseline: a grand-mean (intercept-only) regression that ignores both
# Days and Subject; everything else goes into residual error.
model1 <- lm(Reaction ~ 1, sleepstudy) # intercept model
summary(model1)
##
## Call:
## lm(formula = Reaction ~ 1, data = sleepstudy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.176 -43.132 -9.857 38.244 167.846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 298.508 4.198 71.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.33 on 179 degrees of freedom
# Overlay the intercept-only fit — a flat dashed line, identical for
# every subject — on each subject's raw data.
datafit <- fitted(model1)
ggplot(sleepstudy, aes(Days, Reaction, group = Subject, colour = Subject)) +
  geom_point() +
  geom_line(aes(y = datafit), linetype = 2) +
  facet_wrap(~ Subject, ncol = 9) +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  theme_minimal()
# Random-intercept model: each subject gets their own baseline level,
# but there is still no effect of Days.
model2 <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + (1 | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1904.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4983 -0.5501 -0.1476 0.5123 3.3446
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 1278 35.75
## Residual 1959 44.26
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 298.51 9.05 32.98
# Fitted values from model2: still flat lines, but now at each subject's
# own level (the random intercepts).
datafit=fitted(model2)
ggplot(sleepstudy, aes(Days, Reaction, group=Subject, colour=Subject)) +
geom_point() +
geom_line(aes(y=datafit), linetype=2) +
facet_wrap(~Subject, ncol=9) +
scale_x_continuous(limits=c(0, 10),breaks=c(0,10)) +
theme_minimal()
# Add a fixed effect of Days: a single common slope for everyone,
# combined with subject-specific intercepts.
model3 <- lmer(Reaction ~ 1 + Days + (1 | Subject), sleepstudy)
summary(model3)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + Days + (1 | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1786.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2257 -0.5529 0.0109 0.5188 4.2506
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 1378.2 37.12
## Residual 960.5 30.99
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 251.4051 9.7467 25.79
## Days 10.4673 0.8042 13.02
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.371
# Fitted values from model3: parallel lines — one common slope,
# subject-specific intercepts.
datafit=fitted(model3)
ggplot(sleepstudy, aes(Days, Reaction, group=Subject, colour=Subject)) +
geom_point() +
geom_line(aes(y=datafit), linetype=2) +
facet_wrap(~Subject, ncol=9) +
scale_x_continuous(limits=c(0, 10),breaks=c(0,10)) +
theme_minimal()
# Random intercepts AND random slopes: each subject gets their own
# baseline and their own rate of change across days.
model4 <- lmer(Reaction ~ 1 + Days + (1 + Days | Subject), sleepstudy)
summary(model4)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + Days + (1 + Days | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1743.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9536 -0.4634 0.0231 0.4634 5.1793
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Subject (Intercept) 612.10 24.741
## Days 35.07 5.922 0.07
## Residual 654.94 25.592
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 251.405 6.825 36.838
## Days 10.467 1.546 6.771
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.138
# Fitted values from model4: lines now differ in both level and slope.
datafit=fitted(model4)
ggplot(sleepstudy, aes(Days, Reaction, group=Subject, colour=Subject)) +
geom_point() +
geom_line(aes(y=datafit), linetype=2) +
facet_wrap(~Subject, ncol=9) +
scale_x_continuous(limits=c(0, 10),breaks=c(0,10)) +
theme_minimal()
# Same model as model4: the "1 +" intercepts are implicit, so
# Reaction ~ Days + (Days | Subject) is equivalent shorthand.
mlm_model_rand_slope <- lmer(Reaction ~ Days + (Days| Subject), data =sleepstudy)
summary(mlm_model_rand_slope)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1743.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9536 -0.4634 0.0231 0.4634 5.1793
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Subject (Intercept) 612.10 24.741
## Days 35.07 5.922 0.07
## Residual 654.94 25.592
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 251.405 6.825 36.838
## Days 10.467 1.546 6.771
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.138
library(ggeffects)
# Model-predicted Reaction across Days (fixed effects only), with CI band.
PRED <- ggpredict(mlm_model_rand_slope, terms=c('Days'))
plot(PRED)
#fixef(mlm_model_rand_slope) # fixed-effect estimates
#ranef(mlm_model_rand_slope) # per-subject deviations from the fixed effects
What if we just did a separate regression for each subject?
# Fit a separate OLS regression per subject (no pooling — the opposite
# extreme to the mixed model's partial pooling), keeping each subject's
# intercept and slope.
# Fix: dplyr::do() is superseded; group_modify() returns one row of
# coefficients per subject directly, without an intermediate model column.
separate_slopes <- sleepstudy %>%
  group_by(Subject) %>%
  group_modify(function(d, key) {
    fit <- lm(Reaction ~ Days, data = d)               # per-subject fit
    tibble(intercept = coef(fit)[[1]], slope = coef(fit)[[2]])
  }) %>%
  ungroup()
print(separate_slopes)
## # A tibble: 18 × 3
## # Rowwise:
## Subject intercept slope
## <fct> <dbl> <dbl>
## 1 308 244. 21.8
## 2 309 205. 2.26
## 3 310 203. 6.11
## 4 330 290. 3.01
## 5 331 286. 5.27
## 6 332 264. 9.57
## 7 333 275. 9.14
## 8 334 240. 12.3
## 9 335 263. -2.88
## 10 337 290. 19.0
## 11 349 215. 13.5
## 12 350 226. 19.5
## 13 351 261. 6.43
## 14 352 276. 13.6
## 15 369 255. 11.3
## 16 370 210. 18.1
## 17 371 254. 9.19
## 18 372 267. 11.3
# Random slopes from lmer
# ranef() gives each subject's *deviation* from the fixed-effect slope,
# so a subject's implied slope is fixed effect + deviation.
ranef_slopes <- ranef(mlm_model_rand_slope)$Subject %>%
tibble::rownames_to_column("Subject") %>%
select(Subject, slope = Days)
# Fixed effect slope
fixed_slope <- fixef(mlm_model_rand_slope)["Days"]
# Compute total slopes for each subject in lmer
ranef_slopes <- ranef_slopes %>%
mutate(total_slope = fixed_slope + slope)
print(ranef_slopes)
## Subject slope total_slope
## 1 308 9.1989758 19.6662617
## 2 309 -8.6196806 1.8476053
## 3 310 -5.4488565 5.0184295
## 4 330 -4.8143503 5.6529356
## 5 331 -3.0699116 7.3973743
## 6 332 -0.2721770 10.1951090
## 7 333 -0.2236361 10.2436499
## 8 334 1.0745816 11.5418676
## 9 335 -10.7521652 -0.2848792
## 10 337 8.6282652 19.0955511
## 11 349 1.1734322 11.6407181
## 12 350 6.6142178 17.0815038
## 13 351 -3.0152621 7.4520239
## 14 352 3.5360011 14.0032871
## 15 369 0.8722149 11.3395008
## 16 370 4.8224850 15.2897709
## 17 371 -0.9881562 9.4791297
## 18 372 1.2840221 11.7513080
# Side by side: per-subject OLS slopes (slope.x) vs. mixed-model total
# slopes (total_slope). The lmer slopes are shrunk toward the fixed-effect
# slope — this is partial pooling in action.
comparison <- separate_slopes %>%
left_join(ranef_slopes, by = "Subject")
print(comparison)
## # A tibble: 18 × 5
## # Rowwise:
## Subject intercept slope.x slope.y total_slope
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 308 244. 21.8 9.20 19.7
## 2 309 205. 2.26 -8.62 1.85
## 3 310 203. 6.11 -5.45 5.02
## 4 330 290. 3.01 -4.81 5.65
## 5 331 286. 5.27 -3.07 7.40
## 6 332 264. 9.57 -0.272 10.2
## 7 333 275. 9.14 -0.224 10.2
## 8 334 240. 12.3 1.07 11.5
## 9 335 263. -2.88 -10.8 -0.285
## 10 337 290. 19.0 8.63 19.1
## 11 349 215. 13.5 1.17 11.6
## 12 350 226. 19.5 6.61 17.1
## 13 351 261. 6.43 -3.02 7.45
## 14 352 276. 13.6 3.54 14.0
## 15 369 255. 11.3 0.872 11.3
## 16 370 210. 18.1 4.82 15.3
## 17 371 254. 9.19 -0.988 9.48
## 18 372 267. 11.3 1.28 11.8
For mixed effect models https://www.bristol.ac.uk/cmm/learning/online-course/ - great course # also great resource - https://stats.oarc.ucla.edu/other/mult-pkg/introduction-to-linear-mixed-models/ More detailed course - https://m-clark.github.io/mixed-models-with-R/random_intercepts.html
Practical example - https://osf.io/s3d5z/files/osfstorage see Paper_Analyses.html linked to this paper doi:10.1037/xge0001462
Disclaimer: the fact that ggplot relies on a specific data structure (a data frame) can make it a bit less flexible when you want to layer multiple columns on top of each other - e.g., like `hold on` in Matlab. See https://www.geeksforgeeks.org/draw-multiple-overlaid-histograms-with-ggplot2-package-in-r/
# Two overlaid histograms from different columns: give each
# geom_histogram() its own data and mapping instead of a shared ggplot() one.
petal_layer <- geom_histogram(data = iris, aes(Petal.Length), alpha = 0.5, fill = "red")
sepal_layer <- geom_histogram(data = iris, aes(Sepal.Length), alpha = 0.5, fill = "skyblue")
ggplot() + petal_layer + sepal_layer
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Determine the range for the x-axis
# x_min <- min(iris$Petal.Length, iris$Sepal.Length)
# x_max <- max(iris$Petal.Length, iris$Sepal.Length)
#
# # Create the first histogram for Petal.Length
# hist(iris$Petal.Length,
# breaks = 20,
# col = rgb(1, 0, 0, 0.5), # Semi-transparent red
# xlim = c(x_min, x_max),
# xlab = "Length (cm)",
# ylab = "Frequency",
# main = "Overlay of Petal and Sepal Lengths")
#
# # Overlay the second histogram for Sepal.Length
# hist(iris$Sepal.Length,
# breaks = 20,
# col = rgb(0, 0, 1, 0.5), # Semi-transparent blue
# add = TRUE)
#
# # Add a legend to distinguish the histograms
# legend("topright",
# legend = c("Petal Length", "Sepal Length"),
# fill = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)))
# updated for newer versions of ggplot -
# plot_histogram <- function(df, feature) {
# plt <- ggplot(df, aes(x=eval(parse(text=feature)))) +
# geom_histogram(aes(y = ..density..), alpha=0.7, fill="#33AADE", color="black") +
# geom_density(alpha=0.3, fill="red") +
# geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
# labs(x=feature, y = "Density")
# print(plt)
# }
## eval(parse(text="2+2")) / # eval(2+2) - this is a trick with which we can evaluate the feature argument as a column name
## {{feature }} - is the tidy version of evaluation
plot_histogram <- function(df, feature) {
  # Histogram + density overlay for one column of `df`, with a dashed
  # vertical line at the column mean.
  # `feature` is an unquoted column name, captured with {{ }} (tidy eval).
  # Fix: geom_vline(size =) is deprecated for line widths in current
  # ggplot2 -> linewidth.
  feature_name <- rlang::as_name(rlang::ensym(feature))
  ggplot(df, aes(x = {{ feature }})) +
    geom_histogram(aes(y = after_stat(density)), alpha = 0.7, fill = "#33AADE", color = "black") +
    geom_density(alpha = 0.3, fill = "red") +
    geom_vline(aes(xintercept = mean({{ feature }})), color = "black", linetype = "dashed", linewidth = 1) +
    labs(
      title = paste("Histogram with Density for", feature_name),
      x = feature_name,
      y = "Density"
    ) +
    theme_minimal()
}
plot_multi_histogram <- function(df, feature, label_column) {
  # Overlaid histograms + density curves of `feature`, one colour per level
  # of `label_column`, with a dashed line at the *overall* mean of
  # `feature` (a single line, not one per group — see the group-means
  # variant defined later in this file for per-group lines).
  # Fix: geom_vline(size =) is deprecated in current ggplot2 -> linewidth.
  feature_name <- rlang::as_name(rlang::ensym(feature))
  label_name <- rlang::as_name(rlang::ensym(label_column))
  ggplot(df, aes(x = {{ feature }}, fill = {{ label_column }})) +
    geom_histogram(
      aes(y = after_stat(density)),
      position = "identity",  # overlay the groups rather than stacking them
      alpha = 0.6,
      color = "black",
      bins = 30
    ) +
    geom_density(alpha = 0.4) +
    geom_vline(
      aes(xintercept = mean({{ feature }})),
      color = "black",
      linetype = "dashed",
      linewidth = 1
    ) +
    labs(
      title = paste("Histogram of", feature_name, "by", label_name),
      x = feature_name,
      y = "Density",
      fill = label_name
    ) +
    theme_minimal()
}
plot_histogram(iris, Petal.Length)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot_multi_histogram_with_group_means <- function(df, value_col, group_col) {
  # Like plot_multi_histogram(), but draws one dashed mean line per group,
  # coloured to match that group's fill.
  # Fixes: geom_vline() does not accept inherit.aes (it caused the
  # "Ignoring unknown parameters" warning seen when this was run), and
  # size = is deprecated for line widths in current ggplot2 -> linewidth.
  value_col <- rlang::ensym(value_col)
  group_col <- rlang::ensym(group_col)
  # Step 1: Compute group-wise means
  group_means <- df %>%
    group_by(!!group_col) %>%
    summarize(mean_value = mean(!!value_col), .groups = "drop")
  # Step 2: Plot histogram + group-wise mean lines
  ggplot(df, aes(x = !!value_col, fill = !!group_col)) +
    geom_histogram(aes(y = after_stat(density)), alpha = 0.6, position = "identity", color = "black", bins = 30) +
    geom_density(alpha = 0.3) +
    geom_vline(
      data = group_means,  # vlines come from the summary table, not `df`
      aes(xintercept = mean_value, color = !!group_col),
      linetype = "dashed",
      linewidth = 1,
      show.legend = FALSE
    ) +
    labs(
      title = paste("Histogram of", rlang::as_name(value_col), "by", rlang::as_name(group_col)),
      x = rlang::as_name(value_col),
      y = "Density",
      fill = rlang::as_name(group_col)
      #color = paste("Mean of", rlang::as_name(group_col))
    ) +
    theme_minimal()
}
# Build six Gaussian samples with means 1..6, tag each with a category
# label, and stack them into one long data frame for plotting.
# (Draws happen in the same order as before, so results are identical
# under the same RNG seed.)
category_labels <- c("A", "B", "C", "D", "E", "F")
many_distros <- bind_rows(lapply(seq_along(category_labels), function(i) {
  data.frame(n = rnorm(1000, mean = i), category = category_labels[i])
}))
plot_multi_histogram_with_group_means(many_distros, n, category)
## Warning in geom_vline(data = group_means, aes(xintercept = mean_value, color = !!group_col), : Ignoring unknown parameters:
## `inherit.aes`